# Setup: knitr chunk options, modeling + plotting libraries, and a global theme.
# Library order is kept as-is (attach order determines function masking).
knitr::opts_chunk$set(fig.align="center")
library(rstanarm)
library(tidyverse)
library(tidybayes)
library(modelr)
library(ggplot2)
library(magrittr)
library(emmeans)
library(bayesplot)
library(brms)
library(gganimate)
theme_set(theme_light())
In our experiment, we used a visualization recommendation algorithm (composed of one search algorithm and one oracle algorithm) to generate visualizations for the user on one of two datasets. We then asked the user to evaluate the tool on a variety of metrics (confidence in understanding data, confidence in answer, efficiency, ease of use, utility, and overall).
Given a search algorithm (bfs or dfs), an oracle (compassql or dziban), and a dataset (birdstrikes or movies), we would like to predict a user’s average score for a given metric. In addition, we would like to know if the choice of search algorithm and oracle has any meaningful impact on a user’s rating for these metrics.
# Response metrics from the post-task questionnaire. Each is a Likert-style
# ordinal rating, so the columns are converted to ordered factors below
# (required by the cumulative-probit models).
analyses <- c("confidence.udata", "confidence.ans", "efficiency", "ease.of.use", "utility", "overall")
confidence_metrics <- c("confidence.udata", "confidence.ans")
preference_metrics <- c("efficiency", "ease.of.use", "utility", "overall")
user_response_data <- read.csv('processed_ptask_responses.csv')
# (A duplicate re-definition of `analyses` was removed here; it was identical
# to the one above.)
user_response_data[,analyses] <- lapply(user_response_data[,analyses],ordered)
# Experimental design variables are categorical predictors.
user_response_data <- user_response_data %>%
mutate(
dataset = as.factor(dataset),
oracle = as.factor(oracle),
search = as.factor(search),
task = as.factor(task)
)
# Containers for fitted models and posterior contrasts, filled in per metric.
models <- list()
search_differences <- list()   # bfs vs dfs contrasts
oracle_differences <- list()   # dziban vs compassql contrasts
alg_differences <- list()      # combined search+oracle contrasts
seed <- 12                     # shared RNG seed for brm() fits and posterior draws
filename <- "confidence_udata" # on-disk cache name for the first brm() fit
# Cumulative-probit (ordinal) mixed-effects model for confidence in
# understanding the data. Fixed effects: dataset, oracle x search interaction,
# task; random intercept per participant. adapt_delta = 0.99 reduces divergent
# transitions; `file` caches the compiled fit on disk for re-knitting.
models$confidence_udata <- brm(
formula = bf(confidence.udata ~ dataset + oracle * search + task + (1 | participant_id)),
family = cumulative("probit"),
# Weakly-informative prior on the latent-scale thresholds.
prior = prior(normal(0.26, 1.26), class = Intercept),
chains = 2,
cores = 2,
iter = 2500,
warmup = 1000,
data = user_response_data,
control = list(adapt_delta = 0.99),
file = filename,
seed = seed
)
## Compiling Stan program...
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/Rcpp/include/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/unsupported" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/src/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppParallel/include/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fPIC -Wall -g -O2 -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:88:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:1: error: unknown type name 'namespace'
## namespace Eigen {
## ^
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:16: error: expected ';' after top level declarator
## namespace Eigen {
## ^
## ;
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:96:10: fatal error: 'complex' file not found
## #include <complex>
## ^~~~~~~~~
## 3 errors generated.
## make: *** [foo.o] Error 1
## Start sampling
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$confidence_udata)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: confidence.udata ~ dataset + oracle * search + task + (1 | participant_id)
## Data: user_response_data (Number of observations: 236)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 59)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.04 0.17 0.74 1.44 1.00 868 1592
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -1.91 0.42 -2.71 -1.10 1.00 1639
## Intercept[2] -0.62 0.39 -1.38 0.17 1.00 1643
## Intercept[3] 1.51 0.40 0.75 2.34 1.00 1564
## datasetmovies 0.07 0.32 -0.56 0.68 1.00 1605
## oracledziban 0.28 0.46 -0.61 1.19 1.00 1016
## searchdfs -0.28 0.45 -1.15 0.61 1.00 1301
## task2.RetrieveValue 0.40 0.22 -0.02 0.83 1.00 3761
## task3.Prediction 0.27 0.21 -0.13 0.69 1.00 3805
## task4.Exploration 0.75 0.22 0.30 1.17 1.00 3572
## oracledziban:searchdfs 0.49 0.65 -0.80 1.70 1.01 989
## Tail_ESS
## Intercept[1] 2086
## Intercept[2] 2052
## Intercept[3] 2134
## datasetmovies 1669
## oracledziban 1787
## searchdfs 1367
## task2.RetrieveValue 2444
## task3.Prediction 2595
## task4.Exploration 2198
## oracledziban:searchdfs 1331
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
plot(models$confidence_udata)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plots to check for strongly correlated parameters.
# This model's response has four observed levels, so there are only three
# threshold parameters (b_Intercept[1..3]) -- see the model summary above;
# the original call requested a nonexistent "b_Intercept[4]", which errors.
pairs(
models$confidence_udata,
pars = c("b_Intercept[1]",
"b_Intercept[2]",
"b_Intercept[3]"),
fixed = TRUE
)
pairs(
models$confidence_udata,
pars = c("b_datasetmovies",
"b_oracledziban",
"b_searchdfs",
"b_task2.RetrieveValue",
"b_task3.Prediction",
"b_task4.Exploration"),
fixed = TRUE
)
We now look at an average response for confidence in understanding the data using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Posterior predictive draws per search/oracle condition at the population
# level (re_formula = NA drops the participant random effect).
draw_data_confidence_udata <- user_response_data %>%
add_predicted_draws(models$confidence_udata,
seed = seed,
re_formula = NA) %>%
group_by(search, oracle, .draw) %>%
# NOTE(review): weighted.mean() is called with no weights, so this is a plain
# mean of the predicted category values; mutate() repeats that per-draw mean
# on every row of the group instead of collapsing to one row per draw --
# confirm this duplication is intended before the eye plot below.
mutate(rating = weighted.mean(as.numeric(as.character(.prediction))))
# Eye plot of the per-draw mean rating, with 95% and 50% credible intervals.
confidence_udata_plot <- draw_data_confidence_udata %>%
ggplot(aes(x = oracle, y = rating)) +
stat_eye(.width = c(.95, .5)) +
theme_minimal() +
coord_cartesian(ylim = c(-2, 2)) +
facet_grid(. ~ search)
confidence_udata_plot
We can get the numeric values of the interval boundaries shown above with mean_qi
# Numeric boundaries of the intervals shown in the eye plot above.
fit_info_confidence_udata <-
draw_data_confidence_udata %>%
group_by(search, oracle) %>%
mean_qi(rating, .width = c(.95, .5))
fit_info_confidence_udata
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql 0.954 0.589 1.29 0.95 mean qi
## 2 bfs dziban 1.09 0.766 1.42 0.95 mean qi
## 3 dfs compassql 0.811 0.433 1.13 0.95 mean qi
## 4 dfs dziban 1.19 0.867 1.5 0.95 mean qi
## 5 bfs compassql 0.954 0.839 1.07 0.5 mean qi
## 6 bfs dziban 1.09 0.983 1.2 0.5 mean qi
## 7 dfs compassql 0.811 0.7 0.933 0.5 mean qi
## 8 dfs dziban 1.19 1.08 1.3 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Population-level predictive draws, plus a combined "search oracle" label
# for the pairwise algorithm comparison below.
confidence_udata_predictive_data <- user_response_data %>% add_predicted_draws(models$confidence_udata, seed = seed, re_formula = NA)
confidence_udata_predictive_data$alg <- paste(confidence_udata_predictive_data$search, confidence_udata_predictive_data$oracle)
Differences in user score by search algorithm.
# Posterior distribution of the between-search difference in mean rating.
# NOTE(review): as.numeric() on the ordered factor yields the level index
# (1..K), unlike the as.numeric(as.character(...)) path used for the eye
# plots; the difference is unaffected only if the rating labels are
# consecutive integers -- confirm against the raw data.
search_differences$confidence_udata <- confidence_udata_predictive_data %>%
group_by(search, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = search) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$confidence_udata$metric = "confidence.udata"
# Half-eye of the difference; the dashed line marks zero (no difference).
search_differences$confidence_udata %>%
ggplot(aes(x = diff_in_rating, y = "confidence.udata")) +
xlab(paste0("Expected Difference in Rating (",search_differences$confidence_udata[1,'search'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
Differences in user score by oracle.
# Posterior distribution of the between-oracle difference in mean rating
# (same construction as the search contrast above).
oracle_differences$confidence_udata <- confidence_udata_predictive_data %>%
group_by(oracle, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = oracle) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$confidence_udata$metric = "confidence.udata"
oracle_differences$confidence_udata %>%
ggplot(aes(x = diff_in_rating, y = "confidence.udata")) +
xlab(paste0("Expected Difference in Rating (",oracle_differences$confidence_udata[1,'oracle'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)
# Pairwise contrasts over all search+oracle combinations, then keep only
# the "dfs compassql - bfs dziban" comparison of interest.
alg_differences$confidence_udata <- confidence_udata_predictive_data %>%
group_by(alg, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = alg) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$confidence_udata <- subset(alg_differences$confidence_udata, alg == "dfs compassql - bfs dziban")
alg_differences$confidence_udata$metric = "confidence.udata"
alg_differences$confidence_udata %>%
ggplot(aes(x = diff_in_rating, y = "confidence.udata")) +
xlab(paste0("Expected Difference in Rating (",alg_differences$confidence_udata[1,'alg'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
# Same ordinal model specification, now for confidence in the answer.
filename = "confidence_ans"
models$confidence_ans <- brm(
formula = bf(confidence.ans ~ dataset + oracle * search + task + (1 | participant_id)),
family = cumulative("probit"),
prior = prior(normal(0.26, 1.26), class = Intercept),
chains = 2,
cores = 2,
iter = 2500,
warmup = 1000,
data = user_response_data,
control = list(adapt_delta = 0.99),
file = filename,
seed = seed
)
## Compiling Stan program...
## Start sampling
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$confidence_ans)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: confidence.ans ~ dataset + oracle * search + task + (1 | participant_id)
## Data: user_response_data (Number of observations: 236)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 59)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.56 0.14 0.27 0.84 1.00 893 915
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -3.09 0.39 -3.88 -2.35 1.00 1937
## Intercept[2] -2.29 0.32 -2.94 -1.69 1.00 1592
## Intercept[3] -1.42 0.29 -2.00 -0.87 1.00 1515
## Intercept[4] 0.28 0.28 -0.28 0.84 1.00 1783
## datasetmovies -0.11 0.21 -0.52 0.31 1.00 2005
## oracledziban 0.41 0.31 -0.19 1.03 1.00 1149
## searchdfs 0.15 0.31 -0.47 0.76 1.00 1104
## task2.RetrieveValue -0.27 0.22 -0.69 0.15 1.00 2386
## task3.Prediction -0.98 0.23 -1.43 -0.53 1.00 2104
## task4.Exploration -0.63 0.22 -1.07 -0.21 1.00 2341
## oracledziban:searchdfs -0.15 0.44 -1.01 0.71 1.00 891
## Tail_ESS
## Intercept[1] 2069
## Intercept[2] 1917
## Intercept[3] 1533
## Intercept[4] 2179
## datasetmovies 1900
## oracledziban 1570
## searchdfs 1675
## task2.RetrieveValue 2147
## task3.Prediction 1996
## task4.Exploration 1954
## oracledziban:searchdfs 1110
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
plot(models$confidence_ans)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plots to check for strongly correlated parameters. This model has a
# five-level response, hence four thresholds (see summary above).
pairs(
models$confidence_ans,
pars = c("b_Intercept[1]",
"b_Intercept[2]",
"b_Intercept[3]",
"b_Intercept[4]"),
fixed = TRUE
)
pairs(
models$confidence_ans,
pars = c("b_datasetmovies",
"b_oracledziban",
"b_searchdfs",
"b_task2.RetrieveValue",
"b_task3.Prediction",
"b_task4.Exploration"),
fixed = TRUE
)
We now look at an average response for confidence in answer using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Population-level predictive draws and eye plot for confidence in answer.
# NOTE(review): weighted.mean() has no weights here (plain mean), and
# mutate() repeats the per-draw mean on every group row -- same pattern as
# the confidence_udata plot above.
draw_data_confidence_ans <- user_response_data %>%
add_predicted_draws(models$confidence_ans,
seed = seed,
re_formula = NA) %>%
group_by(search, oracle, .draw) %>%
mutate(rating = weighted.mean(as.numeric(as.character(.prediction))))
confidence_ans_plot <- draw_data_confidence_ans %>%
ggplot(aes(x = oracle, y = rating)) +
stat_eye(.width = c(.95, .5)) +
theme_minimal() +
coord_cartesian(ylim = c(-2, 2)) +
facet_grid(. ~ search)
confidence_ans_plot
We can get the numeric values of the interval boundaries shown above with mean_qi
# Numeric boundaries of the intervals shown in the eye plot above.
fit_info_confidence_ans <-
draw_data_confidence_ans %>%
group_by(search, oracle) %>%
mean_qi(rating, .width = c(.95, .5))
fit_info_confidence_ans
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql 0.958 0.589 1.29 0.95 mean qi
## 2 bfs dziban 1.21 0.917 1.48 0.95 mean qi
## 3 dfs compassql 1.05 0.7 1.35 0.95 mean qi
## 4 dfs dziban 1.21 0.9 1.5 0.95 mean qi
## 5 bfs compassql 0.958 0.839 1.07 0.5 mean qi
## 6 bfs dziban 1.21 1.12 1.32 0.5 mean qi
## 7 dfs compassql 1.05 0.933 1.17 0.5 mean qi
## 8 dfs dziban 1.21 1.12 1.32 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Population-level predictive draws plus combined "search oracle" label.
confidence_ans_predictive_data <- user_response_data %>% add_predicted_draws(models$confidence_ans, seed = seed, re_formula = NA)
confidence_ans_predictive_data$alg <- paste(confidence_ans_predictive_data$search, confidence_ans_predictive_data$oracle)
Differences in user score by search algorithm.
# Posterior distribution of the between-search difference in mean rating.
search_differences$confidence_ans <- confidence_ans_predictive_data %>%
group_by(search, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = search) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$confidence_ans$metric = "confidence.ans"
search_differences$confidence_ans %>%
ggplot(aes(x = diff_in_rating, y = "confidence.ans")) +
xlab(paste0("Expected Difference in Rating (",search_differences$confidence_ans[1,'search'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
Differences in user score by oracle.
# Posterior distribution of the between-oracle difference in mean rating.
oracle_differences$confidence_ans <- confidence_ans_predictive_data %>%
group_by(oracle, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = oracle) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$confidence_ans$metric = "confidence.ans"
oracle_differences$confidence_ans %>%
ggplot(aes(x = diff_in_rating, y = "confidence.ans")) +
xlab(paste0("Expected Difference in Rating (",oracle_differences$confidence_ans[1,'oracle'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)
# "dfs compassql - bfs dziban" contrast only, as in the section above.
alg_differences$confidence_ans <- confidence_ans_predictive_data %>%
group_by(alg, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = alg) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$confidence_ans <- subset(alg_differences$confidence_ans, alg == "dfs compassql - bfs dziban")
alg_differences$confidence_ans$metric = "confidence.ans"
alg_differences$confidence_ans %>%
ggplot(aes(x = diff_in_rating, y = "confidence.ans")) +
xlab(paste0("Expected Difference in Rating (",alg_differences$confidence_ans[1,'alg'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
# Same ordinal model specification, now for perceived efficiency.
filename = "efficiency"
models$efficiency <- brm(
formula = bf(efficiency ~ dataset + oracle * search + task + (1 | participant_id)),
family = cumulative("probit"),
prior = prior(normal(0.26, 1.26), class = Intercept),
chains = 2,
cores = 2,
iter = 2500,
warmup = 1000,
data = user_response_data,
control = list(adapt_delta = 0.99),
file = filename,
seed = seed
)
## Compiling Stan program...
## Start sampling
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$efficiency)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: efficiency ~ dataset + oracle * search + task + (1 | participant_id)
## Data: user_response_data (Number of observations: 236)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 59)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.06 0.16 0.77 1.41 1.00 881 1518
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -2.50 0.43 -3.38 -1.69 1.00 809
## Intercept[2] -0.94 0.40 -1.74 -0.17 1.00 814
## Intercept[3] -0.11 0.40 -0.89 0.66 1.00 816
## Intercept[4] 1.12 0.40 0.32 1.90 1.00 874
## datasetmovies 0.19 0.31 -0.47 0.81 1.00 762
## oracledziban 0.07 0.46 -0.86 0.97 1.00 673
## searchdfs -1.00 0.46 -1.94 -0.11 1.00 640
## task2.RetrieveValue -0.28 0.20 -0.69 0.11 1.00 2389
## task3.Prediction 0.19 0.20 -0.21 0.58 1.00 2177
## task4.Exploration 0.35 0.21 -0.04 0.76 1.00 2171
## oracledziban:searchdfs 0.60 0.63 -0.65 1.87 1.00 655
## Tail_ESS
## Intercept[1] 1318
## Intercept[2] 1596
## Intercept[3] 1578
## Intercept[4] 1628
## datasetmovies 939
## oracledziban 1136
## searchdfs 1245
## task2.RetrieveValue 2108
## task3.Prediction 2170
## task4.Exploration 2106
## oracledziban:searchdfs 1047
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
plot(models$efficiency)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plots to check for strongly correlated parameters (four thresholds
# here, matching the summary above).
pairs(
models$efficiency,
pars = c("b_Intercept[1]",
"b_Intercept[2]",
"b_Intercept[3]",
"b_Intercept[4]"),
fixed = TRUE
)
pairs(
models$efficiency,
pars = c("b_datasetmovies",
"b_oracledziban",
"b_searchdfs",
"b_task2.RetrieveValue",
"b_task3.Prediction",
"b_task4.Exploration"),
fixed = TRUE
)
We now look at an average response for efficiency using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Population-level predictive draws and eye plot for efficiency.
# NOTE(review): weighted.mean() has no weights here (plain mean), and
# mutate() repeats the per-draw mean on every group row.
draw_data_efficiency <- user_response_data %>%
add_predicted_draws(models$efficiency,
seed = seed,
re_formula = NA) %>%
group_by(search, oracle, .draw) %>%
mutate(rating = weighted.mean(as.numeric(as.character(.prediction))))
efficiency_plot <- draw_data_efficiency %>%
ggplot(aes(x = oracle, y = rating)) +
stat_eye(.width = c(.95, .5)) +
theme_minimal() +
coord_cartesian(ylim = c(-2, 2)) +
facet_grid(. ~ search)
efficiency_plot
We can get the numeric values of the interval boundaries shown above with mean_qi
# Numeric boundaries of the intervals shown in the eye plot above.
fit_info_efficiency <-
draw_data_efficiency %>%
group_by(search, oracle) %>%
mean_qi(rating, .width = c(.95, .5))
fit_info_efficiency
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql 0.622 0.0357 1.23 0.95 mean qi
## 2 bfs dziban 0.686 0.1 1.22 0.95 mean qi
## 3 dfs compassql -0.234 -0.783 0.333 0.95 mean qi
## 4 dfs dziban 0.340 -0.217 0.917 0.95 mean qi
## 5 bfs compassql 0.622 0.411 0.839 0.5 mean qi
## 6 bfs dziban 0.686 0.5 0.883 0.5 mean qi
## 7 dfs compassql -0.234 -0.433 -0.0333 0.5 mean qi
## 8 dfs dziban 0.340 0.133 0.55 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Population-level predictive draws plus combined "search oracle" label.
efficiency_predictive_data <- user_response_data %>% add_predicted_draws(models$efficiency, seed = seed, re_formula = NA)
efficiency_predictive_data$alg <- paste(efficiency_predictive_data$search, efficiency_predictive_data$oracle)
Differences in user score by search algorithm.
# Posterior distribution of the between-search difference in mean rating.
search_differences$efficiency <- efficiency_predictive_data %>%
group_by(search, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = search) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$efficiency$metric = "efficiency"
search_differences$efficiency %>%
ggplot(aes(x = diff_in_rating, y = "efficiency")) +
xlab(paste0("Expected Difference in Rating (",search_differences$efficiency[1,'search'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
Differences in user score by oracle.
# Posterior distribution of the between-oracle difference in mean rating.
oracle_differences$efficiency <- efficiency_predictive_data %>%
group_by(oracle, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = oracle) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$efficiency$metric = "efficiency"
oracle_differences$efficiency %>%
ggplot(aes(x = diff_in_rating, y = "efficiency")) +
xlab(paste0("Expected Difference in Rating (",oracle_differences$efficiency[1,'oracle'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)
# "dfs compassql - bfs dziban" contrast only, as in the sections above.
alg_differences$efficiency <- efficiency_predictive_data %>%
group_by(alg, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = alg) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$efficiency <- subset(alg_differences$efficiency, alg == "dfs compassql - bfs dziban")
alg_differences$efficiency$metric = "efficiency"
alg_differences$efficiency %>%
ggplot(aes(x = diff_in_rating, y = "efficiency")) +
xlab(paste0("Expected Difference in Rating (",alg_differences$efficiency[1,'alg'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
# Same ordinal model specification, now for perceived ease of use.
filename = "ease_of_use"
models$ease_of_use <- brm(
formula = bf(ease.of.use ~ dataset + oracle * search + task + (1 | participant_id)),
family = cumulative("probit"),
prior = prior(normal(0.26, 1.26), class = Intercept),
chains = 2,
cores = 2,
iter = 2500,
warmup = 1000,
data = user_response_data,
control = list(adapt_delta = 0.99),
file = filename,
seed = seed
)
## Compiling Stan program...
## Start sampling
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$ease_of_use)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: ease.of.use ~ dataset + oracle * search + task + (1 | participant_id)
## Data: user_response_data (Number of observations: 236)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 59)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.03 0.16 0.75 1.37 1.00 957 1411
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -2.76 0.44 -3.63 -1.93 1.00 1453
## Intercept[2] -1.25 0.39 -2.00 -0.51 1.00 1410
## Intercept[3] -0.38 0.38 -1.13 0.36 1.00 1401
## Intercept[4] 1.49 0.40 0.74 2.28 1.00 1378
## datasetmovies 0.39 0.32 -0.23 1.01 1.00 1321
## oracledziban -0.18 0.44 -1.05 0.67 1.00 1231
## searchdfs -1.19 0.43 -2.05 -0.36 1.00 1125
## task2.RetrieveValue 0.29 0.21 -0.12 0.71 1.00 3396
## task3.Prediction 0.45 0.21 0.05 0.86 1.00 3060
## task4.Exploration 0.50 0.21 0.10 0.90 1.00 3108
## oracledziban:searchdfs 0.70 0.61 -0.51 1.94 1.00 1181
## Tail_ESS
## Intercept[1] 1714
## Intercept[2] 1652
## Intercept[3] 1611
## Intercept[4] 1487
## datasetmovies 1557
## oracledziban 1632
## searchdfs 1697
## task2.RetrieveValue 2232
## task3.Prediction 2398
## task4.Exploration 2263
## oracledziban:searchdfs 1690
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
plot(models$ease_of_use)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plots to check for strongly correlated parameters (four thresholds
# here, matching the summary above).
pairs(
models$ease_of_use,
pars = c("b_Intercept[1]",
"b_Intercept[2]",
"b_Intercept[3]",
"b_Intercept[4]"),
fixed = TRUE
)
pairs(
models$ease_of_use,
pars = c("b_datasetmovies",
"b_oracledziban",
"b_searchdfs",
"b_task2.RetrieveValue",
"b_task3.Prediction",
"b_task4.Exploration"),
fixed = TRUE
)
We now look at an average response for ease of use using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Population-level predictive draws and eye plot for ease of use.
# NOTE(review): weighted.mean() has no weights here (plain mean), and
# mutate() repeats the per-draw mean on every group row.
draw_data_ease_of_use <- user_response_data %>%
add_predicted_draws(models$ease_of_use,
seed = seed,
re_formula = NA) %>%
group_by(search, oracle, .draw) %>%
mutate(rating = weighted.mean(as.numeric(as.character(.prediction))))
ease_of_use_plot <- draw_data_ease_of_use %>%
ggplot(aes(x = oracle, y = rating)) +
stat_eye(.width = c(.95, .5)) +
theme_minimal() +
coord_cartesian(ylim = c(-2, 2)) +
facet_grid(. ~ search)
ease_of_use_plot
We can get the numeric values of the interval boundaries shown above with mean_qi
# Numeric boundaries of the intervals shown in the eye plot above.
fit_info_ease_of_use <-
draw_data_ease_of_use %>%
group_by(search, oracle) %>%
mean_qi(rating, .width = c(.95, .5))
fit_info_ease_of_use
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql 0.918 0.482 1.30 0.95 mean qi
## 2 bfs dziban 0.815 0.35 1.2 0.95 mean qi
## 3 dfs compassql 0.0884 -0.417 0.567 0.95 mean qi
## 4 dfs dziban 0.478 -0.05 0.95 0.95 mean qi
## 5 bfs compassql 0.918 0.786 1.05 0.5 mean qi
## 6 bfs dziban 0.815 0.683 0.967 0.5 mean qi
## 7 dfs compassql 0.0884 -0.0833 0.267 0.5 mean qi
## 8 dfs dziban 0.478 0.317 0.65 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Population-level predictive draws plus combined "search oracle" label.
ease_of_use_predictive_data <- user_response_data %>% add_predicted_draws(models$ease_of_use, seed = seed, re_formula = NA)
ease_of_use_predictive_data$alg <- paste(ease_of_use_predictive_data$search, ease_of_use_predictive_data$oracle)
Differences in user score by search algorithm.
# Posterior distribution of the between-search difference in mean rating.
search_differences$ease_of_use <- ease_of_use_predictive_data %>%
group_by(search, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = search) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$ease_of_use$metric = "ease.of.use"
search_differences$ease_of_use %>%
ggplot(aes(x = diff_in_rating, y = "ease.of.use")) +
xlab(paste0("Expected Difference in Rating (",search_differences$ease_of_use[1,'search'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
Differences in user score by oracle.
# Posterior distribution of the between-oracle difference in mean rating.
oracle_differences$ease_of_use <- ease_of_use_predictive_data %>%
group_by(oracle, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = oracle) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$ease_of_use$metric = "ease.of.use"
oracle_differences$ease_of_use %>%
ggplot(aes(x = diff_in_rating, y = "ease.of.use")) +
xlab(paste0("Expected Difference in Rating (",oracle_differences$ease_of_use[1,'oracle'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)
# Contrast full search/oracle combinations; compare_levels produces all
# pairwise differences, of which only "dfs compassql - bfs dziban" is kept.
alg_differences$ease_of_use <- ease_of_use_predictive_data %>%
group_by(alg, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = alg) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$ease_of_use <- subset(alg_differences$ease_of_use, alg == "dfs compassql - bfs dziban")
alg_differences$ease_of_use$metric = "ease.of.use"
# Posterior of the retained combination contrast.
alg_differences$ease_of_use %>%
ggplot(aes(x = diff_in_rating, y = "ease.of.use")) +
xlab(paste0("Expected Difference in Rating (",alg_differences$ease_of_use[1,'alg'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
# Bayesian ordinal regression (cumulative probit) for the utility rating:
# fixed effects for dataset, the oracle x search interaction, and task, with
# a per-participant random intercept to absorb repeated measures.
filename = "utility"
models$utility <- brm(
formula = bf(utility ~ dataset + oracle * search + task + (1 | participant_id)),
family = cumulative("probit"),
# Weakly informative prior on the latent-scale thresholds.
prior = prior(normal(0.26, 1.26), class = Intercept),
chains = 2,
cores = 2,
iter = 2500,
warmup = 1000,
data = user_response_data,
control = list(adapt_delta = 0.99), # high adapt_delta to curb divergences
file = filename, # caches the fit on disk; delete the file to refit
seed = seed
)
## Compiling Stan program...
## Start sampling
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
# Convergence/efficiency diagnostics: Rhat ~ 1 and large Bulk_ESS expected.
summary(models$utility)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: utility ~ dataset + oracle * search + task + (1 | participant_id)
## Data: user_response_data (Number of observations: 236)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 59)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.93 0.15 0.67 1.25 1.00 692 1577
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -1.75 0.37 -2.52 -1.02 1.00 678
## Intercept[2] -0.51 0.36 -1.22 0.22 1.00 732
## Intercept[3] 0.14 0.36 -0.58 0.84 1.00 708
## Intercept[4] 1.38 0.37 0.68 2.14 1.00 762
## datasetmovies 0.27 0.28 -0.28 0.84 1.00 917
## oracledziban 0.19 0.42 -0.65 1.02 1.00 778
## searchdfs -0.60 0.40 -1.35 0.23 1.00 761
## task2.RetrieveValue -0.21 0.20 -0.62 0.18 1.00 1717
## task3.Prediction 0.32 0.20 -0.07 0.71 1.00 1781
## task4.Exploration 0.58 0.21 0.17 0.97 1.00 1777
## oracledziban:searchdfs 0.27 0.59 -0.89 1.40 1.00 683
## Tail_ESS
## Intercept[1] 925
## Intercept[2] 1371
## Intercept[3] 1381
## Intercept[4] 1285
## datasetmovies 1292
## oracledziban 1106
## searchdfs 1214
## task2.RetrieveValue 2148
## task3.Prediction 2217
## task4.Exploration 2030
## oracledziban:searchdfs 995
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# Trace and density plots for each parameter of the utility model.
plot(models$utility)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairwise posterior scatterplots of the threshold parameters; strong
# correlation here would indicate the model struggles to separate them.
pairs(
models$utility,
pars = c("b_Intercept[1]",
"b_Intercept[2]",
"b_Intercept[3]",
"b_Intercept[4]"),
fixed = TRUE
)
# Same check for the population-level effects.
pairs(
models$utility,
pars = c("b_datasetmovies",
"b_oracledziban",
"b_searchdfs",
"b_task2.RetrieveValue",
"b_task3.Prediction",
"b_task4.Exploration"),
fixed = TRUE
)
We now look at the average response for Utility using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.
# Per-draw mean predicted Utility for each search/oracle cell.
# NOTE(review): as.numeric(as.character(.prediction)) maps the ordered-factor
# labels to their numeric values, unlike the as.numeric() level index used in
# the difference blocks -- confirm the rating levels are numeric labels.
draw_data_utility <- user_response_data %>%
add_predicted_draws(models$utility,
seed = seed,
re_formula = NA) %>%
group_by(search, oracle, .draw) %>%
mutate(rating = weighted.mean(as.numeric(as.character(.prediction))))
# Eye plot of the posterior cell means, faceted by search algorithm.
utility_plot <- draw_data_utility %>%
ggplot(aes(x = oracle, y = rating)) +
stat_eye(.width = c(.95, .5)) +
theme_minimal() +
coord_cartesian(ylim = c(-2, 2)) +
facet_grid(. ~ search)
utility_plot
We can get the numeric values of the interval boundaries shown above with mean_qi
# Numeric 95% and 50% interval boundaries for each search/oracle cell.
fit_info_utility <- draw_data_utility %>% group_by(search, oracle) %>% mean_qi(rating, .width = c(.95, .5))
fit_info_utility
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql 0.464 -0.143 1.02 0.95 mean qi
## 2 bfs dziban 0.641 0.05 1.2 0.95 mean qi
## 3 dfs compassql -0.0936 -0.667 0.483 0.95 mean qi
## 4 dfs dziban 0.345 -0.25 0.917 0.95 mean qi
## 5 bfs compassql 0.464 0.268 0.661 0.5 mean qi
## 6 bfs dziban 0.641 0.45 0.833 0.5 mean qi
## 7 dfs compassql -0.0936 -0.3 0.1 0.5 mean qi
## 8 dfs dziban 0.345 0.15 0.55 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Posterior predictive draws for utility, ignoring the participant-level
# random effect (re_formula = NA); `alg` labels each search/oracle combination.
utility_predictive_data <- user_response_data %>%
  add_predicted_draws(models$utility, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))
Differences in user score by search algorithm.
# Draw-wise mean predicted utility per search algorithm, then the contrast
# between search levels within each draw (same recipe as the other metrics).
search_differences$utility <- utility_predictive_data %>%
group_by(search, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = search) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$utility$metric = "utility"
# Posterior of the search contrast; dashed line at 0 marks "no difference".
search_differences$utility %>%
ggplot(aes(x = diff_in_rating, y = "utility")) +
xlab(paste0("Expected Difference in Rating (",search_differences$utility[1,'search'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
Differences in user score by oracle.
# Same procedure as the search contrast above, but grouped by oracle.
oracle_differences$utility <- utility_predictive_data %>%
group_by(oracle, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = oracle) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$utility$metric = "utility"
# Posterior of the oracle contrast; dashed line at 0 marks "no difference".
oracle_differences$utility %>%
ggplot(aes(x = diff_in_rating, y = "utility")) +
xlab(paste0("Expected Difference in Rating (",oracle_differences$utility[1,'oracle'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)
# Contrast full search/oracle combinations; keep only the
# "dfs compassql - bfs dziban" comparison of interest.
alg_differences$utility <- utility_predictive_data %>%
group_by(alg, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = alg) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$utility <- subset(alg_differences$utility, alg == "dfs compassql - bfs dziban")
alg_differences$utility$metric = "utility"
# Posterior of the retained combination contrast.
alg_differences$utility %>%
ggplot(aes(x = diff_in_rating, y = "utility")) +
xlab(paste0("Expected Difference in Rating (",alg_differences$utility[1,'alg'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
# Bayesian ordinal regression (cumulative probit) for the overall rating,
# with the same fixed effects and participant random intercept as the other
# metric models.
filename = "overall"
models$overall <- brm(
formula = bf(overall ~ dataset + oracle * search + task + (1 | participant_id)),
family = cumulative("probit"),
# Weakly informative prior on the latent-scale thresholds.
prior = prior(normal(0.26, 1.26), class = Intercept),
chains = 2,
cores = 2,
iter = 2500,
warmup = 1000,
data = user_response_data,
control = list(adapt_delta = 0.99), # high adapt_delta to curb divergences
file = filename, # caches the fit on disk; delete the file to refit
seed = seed
)
## Compiling Stan program...
## Start sampling
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
# Convergence/efficiency diagnostics: Rhat ~ 1 and large Bulk_ESS expected.
summary(models$overall)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: overall ~ dataset + oracle * search + task + (1 | participant_id)
## Data: user_response_data (Number of observations: 236)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 59)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.40 0.20 1.05 1.81 1.01 898 1147
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -3.05 0.54 -4.17 -1.98 1.00 1132
## Intercept[2] -1.68 0.49 -2.68 -0.71 1.00 1053
## Intercept[3] -0.39 0.47 -1.32 0.54 1.00 1022
## Intercept[4] 1.69 0.49 0.74 2.67 1.00 1025
## datasetmovies -0.17 0.41 -0.98 0.62 1.00 1032
## oracledziban 0.10 0.57 -0.99 1.23 1.00 842
## searchdfs -0.71 0.57 -1.82 0.41 1.00 885
## task2.RetrieveValue -0.03 0.21 -0.43 0.37 1.00 3706
## task3.Prediction 0.38 0.21 -0.04 0.79 1.00 3639
## task4.Exploration 0.66 0.22 0.24 1.10 1.00 3240
## oracledziban:searchdfs 0.38 0.81 -1.25 1.95 1.00 867
## Tail_ESS
## Intercept[1] 1454
## Intercept[2] 1394
## Intercept[3] 1398
## Intercept[4] 1523
## datasetmovies 1507
## oracledziban 1557
## searchdfs 1292
## task2.RetrieveValue 2394
## task3.Prediction 2516
## task4.Exploration 2627
## oracledziban:searchdfs 1228
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# Trace and density plots for each parameter of the overall model.
plot(models$overall)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairwise posterior scatterplots of the threshold parameters; strong
# correlation here would indicate the model struggles to separate them.
pairs(
models$overall,
pars = c("b_Intercept[1]",
"b_Intercept[2]",
"b_Intercept[3]",
"b_Intercept[4]"),
fixed = TRUE
)
# Same check for the population-level effects.
pairs(
models$overall,
pars = c("b_datasetmovies",
"b_oracledziban",
"b_searchdfs",
"b_task2.RetrieveValue",
"b_task3.Prediction",
"b_task4.Exploration"),
fixed = TRUE
)
We now look at the average response for Overall using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.
# Per-draw mean predicted Overall rating for each search/oracle cell.
# NOTE(review): as.numeric(as.character(.prediction)) maps ordered-factor
# labels to numeric values, unlike the level-index coding in the difference
# blocks -- confirm the rating levels are numeric labels.
draw_data_overall <- user_response_data %>%
add_predicted_draws(models$overall,
seed = seed,
re_formula = NA) %>%
group_by(search, oracle, .draw) %>%
mutate(rating = weighted.mean(as.numeric(as.character(.prediction))))
# Eye plot of the posterior cell means, faceted by search algorithm.
overall_plot <- draw_data_overall %>%
ggplot(aes(x = oracle, y = rating)) +
stat_eye(.width = c(.95, .5)) +
theme_minimal() +
coord_cartesian(ylim = c(-2, 2)) +
facet_grid(. ~ search)
overall_plot
We can get the numeric values of the interval boundaries shown above with mean_qi
# Numeric 95% and 50% interval boundaries for each search/oracle cell.
fit_info_overall <- draw_data_overall %>% group_by(search, oracle) %>% mean_qi(rating, .width = c(.95, .5))
fit_info_overall
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql 0.723 0.196 1.16 0.95 mean qi
## 2 bfs dziban 0.774 0.283 1.2 0.95 mean qi
## 3 dfs compassql 0.294 -0.267 0.783 0.95 mean qi
## 4 dfs dziban 0.584 0.05 1.03 0.95 mean qi
## 5 bfs compassql 0.723 0.571 0.893 0.5 mean qi
## 6 bfs dziban 0.774 0.633 0.933 0.5 mean qi
## 7 dfs compassql 0.294 0.117 0.483 0.5 mean qi
## 8 dfs dziban 0.584 0.433 0.767 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Posterior predictive draws for the overall rating, ignoring the
# participant-level random effect (re_formula = NA); `alg` labels each
# search/oracle combination.
overall_predictive_data <- user_response_data %>%
  add_predicted_draws(models$overall, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))
Differences in user score by search algorithm.
# Draw-wise mean predicted overall rating per search algorithm, then the
# contrast between search levels within each draw.
search_differences$overall <- overall_predictive_data %>%
group_by(search, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = search) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$overall$metric = "overall"
# Posterior of the search contrast; dashed line at 0 marks "no difference".
search_differences$overall %>%
ggplot(aes(x = diff_in_rating, y = "overall")) +
xlab(paste0("Expected Difference in Rating (",search_differences$overall[1,'search'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
Differences in user score by oracle.
# Same procedure as the search contrast above, but grouped by oracle.
oracle_differences$overall <- overall_predictive_data %>%
group_by(oracle, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = oracle) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$overall$metric = "overall"
# Posterior of the oracle contrast; dashed line at 0 marks "no difference".
oracle_differences$overall %>%
ggplot(aes(x = diff_in_rating, y = "overall")) +
xlab(paste0("Expected Difference in Rating (",oracle_differences$overall[1,'oracle'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)
# Contrast full search/oracle combinations; keep only the
# "dfs compassql - bfs dziban" comparison of interest.
alg_differences$overall <- overall_predictive_data %>%
group_by(alg, .draw) %>%
summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
compare_levels(rating, by = alg) %>%
rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$overall <- subset(alg_differences$overall, alg == "dfs compassql - bfs dziban")
alg_differences$overall$metric = "overall"
# Posterior of the retained combination contrast.
alg_differences$overall %>%
ggplot(aes(x = diff_in_rating, y = "overall")) +
xlab(paste0("Expected Difference in Rating (",alg_differences$overall[1,'alg'],")")) +
ylab("Condition")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
Putting all of the plots for search algorithm and oracle differences together, split by whether the rating metric is of type confidence or preference. We’ll start with differences in search algorithms.
# Stack the per-metric search contrasts into one data frame for plotting.
combined_search_differences <- rbind(
  search_differences$confidence_udata,
  search_differences$confidence_ans,
  search_differences$efficiency,
  search_differences$ease_of_use,
  search_differences$utility,
  search_differences$overall
)
# Order metric levels to match `analyses` (reversed so the first metric is
# drawn at the top of the horizontal plot axis).
combined_search_differences$metric <- factor(combined_search_differences$metric, levels = rev(analyses))
# Flip the contrast so it reads "bfs - dfs".
# Fix: index the column as a scalar with `$search[1]` -- `[1, 'search']`
# returns a 1x1 tibble when compare_levels() yields a tibble, which makes
# the `if()` condition fragile.
if (combined_search_differences$search[1] == "dfs - bfs") {
  combined_search_differences$search <- "bfs - dfs"
  combined_search_differences$diff_in_rating <- -1 * combined_search_differences$diff_in_rating
}
# Search contrasts for the confidence-type metrics only.
combined_search_differences_confidence <- subset(combined_search_differences, metric %in% confidence_metrics)
# Half-eye posterior plot per metric; dashed line at 0 marks "no difference".
search_differences_plot_confidence <- combined_search_differences_confidence %>%
ggplot(aes(x = diff_in_rating, y = metric)) +
ylab("Confidence") +
xlab(paste0("Expected Difference in Rating (",combined_search_differences_confidence[1,'search'],")")) +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
search_differences_plot_confidence
View intervals
# Numeric 95% and 50% interval boundaries for the plotted contrasts.
fit_info_search_differences_confidence <- combined_search_differences_confidence %>% group_by(search, metric) %>% mean_qi(diff_in_rating, .width = c(.95, .5))
fit_info_search_differences_confidence
## # A tibble: 4 x 8
## # Groups: search [1]
## search metric diff_in_rating .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs - dfs confidence.ans -0.0398 -0.352 0.268 0.95 mean qi
## 2 bfs - dfs confidence.ud… 0.0261 -0.312 0.370 0.95 mean qi
## 3 bfs - dfs confidence.ans -0.0398 -0.147 0.0649 0.5 mean qi
## 4 bfs - dfs confidence.ud… 0.0261 -0.0917 0.145 0.5 mean qi
# Search contrasts for the preference-type metrics only.
combined_search_differences_preference <- subset(combined_search_differences, metric %in% preference_metrics)
# Half-eye posterior plot per metric; dashed line at 0 marks "no difference".
search_differences_plot_preference <- combined_search_differences_preference %>%
ggplot(aes(x = diff_in_rating, y = metric)) +
# Fix: this panel shows preference metrics; the axis was mislabeled
# "Confidence" (copy-paste from the confidence panel).
ylab("Preference") +
xlab(paste0("Expected Difference in Rating (",combined_search_differences_preference[1,'search'],")")) +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
search_differences_plot_preference
View intervals
# Numeric 95% and 50% interval boundaries for the plotted contrasts.
fit_info_search_differences_preference <- combined_search_differences_preference %>% group_by(search, metric) %>% mean_qi(diff_in_rating, .width = c(.95, .5))
fit_info_search_differences_preference
## # A tibble: 8 x 8
## # Groups: search [1]
## search metric diff_in_rating .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs - dfs overall 0.310 -0.185 0.814 0.95 mean qi
## 2 bfs - dfs utility 0.430 -0.158 1.00 0.95 mean qi
## 3 bfs - dfs ease.of.use 0.581 0.114 1.06 0.95 mean qi
## 4 bfs - dfs efficiency 0.602 0.0224 1.19 0.95 mean qi
## 5 bfs - dfs overall 0.310 0.152 0.470 0.5 mean qi
## 6 bfs - dfs utility 0.430 0.239 0.631 0.5 mean qi
## 7 bfs - dfs ease.of.use 0.581 0.423 0.739 0.5 mean qi
## 8 bfs - dfs efficiency 0.602 0.396 0.805 0.5 mean qi
# Stack the per-metric oracle contrasts; no sign flip is needed here because
# compare_levels already reports "dziban - compassql" for every metric.
combined_oracle_differences <- rbind(oracle_differences$confidence_udata, oracle_differences$confidence_ans, oracle_differences$efficiency, oracle_differences$ease_of_use, oracle_differences$utility, oracle_differences$overall)
combined_oracle_differences$metric <- factor(combined_oracle_differences$metric, levels=rev(analyses))
# Oracle contrasts for the confidence-type metrics only.
combined_oracle_differences_confidence <- subset(combined_oracle_differences, metric %in% confidence_metrics)
# Half-eye posterior plot per metric; dashed line at 0 marks "no difference".
oracle_differences_plot_confidence <- combined_oracle_differences_confidence %>%
ggplot(aes(x = diff_in_rating, y = metric)) +
ylab("Confidence") +
xlab(paste0("Expected Difference in Rating (",combined_oracle_differences_confidence[1,'oracle'],")")) +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
oracle_differences_plot_confidence
View intervals
# Numeric 95% and 50% interval boundaries for the plotted contrasts.
fit_info_oracle_differences_confidence <- combined_oracle_differences_confidence %>% group_by(oracle, metric) %>% mean_qi(diff_in_rating, .width = c(.95, .5))
fit_info_oracle_differences_confidence
## # A tibble: 4 x 8
## # Groups: oracle [1]
## oracle metric diff_in_rating .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dziban - co… confidence… 0.207 -0.105 0.527 0.95 mean qi
## 2 dziban - co… confidence… 0.261 -0.0652 0.604 0.95 mean qi
## 3 dziban - co… confidence… 0.207 0.105 0.312 0.5 mean qi
## 4 dziban - co… confidence… 0.261 0.143 0.378 0.5 mean qi
# Oracle contrasts for the preference-type metrics only.
combined_oracle_differences_preference <- subset(combined_oracle_differences, metric %in% preference_metrics)
# Half-eye posterior plot per metric; dashed line at 0 marks "no difference".
oracle_differences_plot_preference <- combined_oracle_differences_preference %>%
ggplot(aes(x = diff_in_rating, y = metric)) +
# Fix: this panel shows preference metrics; the axis was mislabeled
# "Confidence" (copy-paste from the confidence panel).
ylab("Preference") +
xlab(paste0("Expected Difference in Rating (",combined_oracle_differences_preference[1,'oracle'],")")) +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
oracle_differences_plot_preference
View intervals
# Numeric 95% and 50% interval boundaries for the plotted contrasts.
fit_info_oracle_differences_preference <- combined_oracle_differences_preference %>% group_by(oracle, metric) %>% mean_qi(diff_in_rating, .width = c(.95, .5))
fit_info_oracle_differences_preference
## # A tibble: 8 x 8
## # Groups: oracle [1]
## oracle metric diff_in_rating .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dziban - comp… overall 0.178 -0.295 0.670 0.95 mean qi
## 2 dziban - comp… utility 0.317 -0.266 0.871 0.95 mean qi
## 3 dziban - comp… ease.of… 0.158 -0.297 0.618 0.95 mean qi
## 4 dziban - comp… efficie… 0.334 -0.237 0.923 0.95 mean qi
## 5 dziban - comp… overall 0.178 0.0138 0.338 0.5 mean qi
## 6 dziban - comp… utility 0.317 0.122 0.512 0.5 mean qi
## 7 dziban - comp… ease.of… 0.158 -0.00158 0.318 0.5 mean qi
## 8 dziban - comp… efficie… 0.334 0.142 0.527 0.5 mean qi
# Stack the per-metric combination contrasts ("dfs compassql - bfs dziban")
# into one data frame for plotting.
combined_alg_differences <- rbind(
  alg_differences$confidence_udata,
  alg_differences$confidence_ans,
  alg_differences$efficiency,
  alg_differences$ease_of_use,
  alg_differences$utility,
  alg_differences$overall
)
combined_alg_differences$metric <- factor(combined_alg_differences$metric, levels = rev(analyses))
# Fix: index the column as a scalar with `$alg[1]` -- `[1, 'alg']` returns a
# 1x1 tibble when the contrasts are tibbles, making the `if()` fragile.
# NOTE(review): the `alg` contrasts here read "dfs compassql - bfs dziban",
# so this "dfs - bfs" flip appears to be dead copy-paste from the search
# version and should never trigger -- confirm and consider removing.
if (combined_alg_differences$alg[1] == "dfs - bfs") {
  combined_alg_differences$alg <- "bfs - dfs"
  combined_alg_differences$diff_in_rating <- -1 * combined_alg_differences$diff_in_rating
}
# Combination contrasts for the confidence-type metrics only.
combined_alg_differences_confidence <- subset(combined_alg_differences, metric %in% confidence_metrics)
# Half-eye posterior plot per metric; dashed line at 0 marks "no difference".
alg_differences_plot_confidence <- combined_alg_differences_confidence %>%
ggplot(aes(x = diff_in_rating, y = metric)) +
ylab("Confidence") +
xlab(paste0("Expected Difference in Rating (",combined_alg_differences_confidence[1,'alg'],")")) +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
alg_differences_plot_confidence
View intervals
# Numeric 95% and 50% interval boundaries for the plotted contrasts.
fit_info_alg_differences_confidence <- combined_alg_differences_confidence %>% group_by(alg, metric) %>% mean_qi(diff_in_rating, .width = c(.95, .5))
fit_info_alg_differences_confidence
## # A tibble: 4 x 8
## # Groups: alg [1]
## alg metric diff_in_rating .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dfs compassql… confiden… -0.165 -0.6 0.283 0.95 mean qi
## 2 dfs compassql… confiden… -0.282 -0.783 0.217 0.95 mean qi
## 3 dfs compassql… confiden… -0.165 -0.3 -0.0167 0.5 mean qi
## 4 dfs compassql… confiden… -0.282 -0.450 -0.117 0.5 mean qi
# Combination contrasts for the preference-type metrics only.
combined_alg_differences_preference <- subset(combined_alg_differences, metric %in% preference_metrics)
# Half-eye posterior plot per metric; dashed line at 0 marks "no difference".
alg_differences_plot_preference <- combined_alg_differences_preference %>%
ggplot(aes(x = diff_in_rating, y = metric)) +
# Fix: this panel shows preference metrics; the axis was mislabeled
# "Confidence" (copy-paste from the confidence panel).
ylab("Preference") +
xlab(paste0("Expected Difference in Rating (",combined_alg_differences_preference[1,'alg'],")")) +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
alg_differences_plot_preference
View intervals
# Numeric 95% and 50% interval boundaries for the plotted contrasts.
fit_info_alg_differences_preference <- combined_alg_differences_preference %>% group_by(alg, metric) %>% mean_qi(diff_in_rating, .width = c(.95, .5))
fit_info_alg_differences_preference
## # A tibble: 8 x 8
## # Groups: alg [1]
## alg metric diff_in_rating .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dfs compassql … overall -0.480 -1.17 0.2 0.95 mean qi
## 2 dfs compassql … utility -0.734 -1.53 0.0833 0.95 mean qi
## 3 dfs compassql … ease.of… -0.727 -1.38 -0.0500 0.95 mean qi
## 4 dfs compassql … efficie… -0.920 -1.70 -0.117 0.95 mean qi
## 5 dfs compassql … overall -0.480 -0.717 -0.233 0.5 mean qi
## 6 dfs compassql … utility -0.734 -1 -0.467 0.5 mean qi
## 7 dfs compassql … ease.of… -0.727 -0.95 -0.5 0.5 mean qi
## 8 dfs compassql … efficie… -0.920 -1.2 -0.650 0.5 mean qi